#data cleanup
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(stringr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
#Visualization
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(ggplot2)
library(DT)
#Data
library(bea.R)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
## Note: As of February 2018, beaGet() requires 'TableName' for NIPA and NIUnderlyingDetail data instead of 'TableID.' See https://github.com/us-bea/bea.R for details.
library(devtools)
## Loading required package: usethis
## Error in get(genname, envir = envir) : object 'testthat_print' not found
library(gtrendsR)
#Text Analysis
library(tidytext)
library(wordcloud)
## Loading required package: RColorBrewer
library(RColorBrewer)
#Forecasting
library(quantmod)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'xts'
## The following objects are masked from 'package:data.table':
##
## first, last
## The following objects are masked from 'package:dplyr':
##
## first, last
## Loading required package: TTR
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(forecast)
library(tseries)
library(prophet)
## Loading required package: Rcpp
## Loading required package: rlang
##
## Attaching package: 'rlang'
## The following object is masked from 'package:data.table':
##
## :=
## The following object is masked from 'package:magrittr':
##
## set_names
# Data Cleaning ----
# load the dataset
library(tidyverse)
## ─ Attaching packages ─────────────────────────────────────────────── tidyverse 1.3.0 ─
## ✓ tibble 3.0.1 ✓ purrr 0.3.4
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ─ Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ─
## x rlang:::=() masks data.table:::=()
## x purrr::%@%() masks rlang::%@%()
## x purrr::as_function() masks rlang::as_function()
## x lubridate::as.difftime() masks base::as.difftime()
## x data.table::between() masks dplyr::between()
## x lubridate::date() masks base::date()
## x magrittr::extract() masks tidyr::extract()
## x plotly::filter() masks dplyr::filter(), stats::filter()
## x xts::first() masks data.table::first(), dplyr::first()
## x purrr::flatten() masks rlang::flatten()
## x purrr::flatten_chr() masks rlang::flatten_chr()
## x purrr::flatten_dbl() masks rlang::flatten_dbl()
## x purrr::flatten_int() masks rlang::flatten_int()
## x purrr::flatten_lgl() masks rlang::flatten_lgl()
## x purrr::flatten_raw() masks rlang::flatten_raw()
## x data.table::hour() masks lubridate::hour()
## x lubridate::intersect() masks base::intersect()
## x purrr::invoke() masks rlang::invoke()
## x data.table::isoweek() masks lubridate::isoweek()
## x dplyr::lag() masks stats::lag()
## x xts::last() masks data.table::last(), dplyr::last()
## x purrr::list_along() masks rlang::list_along()
## x data.table::mday() masks lubridate::mday()
## x data.table::minute() masks lubridate::minute()
## x purrr::modify() masks rlang::modify()
## x data.table::month() masks lubridate::month()
## x purrr::prepend() masks rlang::prepend()
## x data.table::quarter() masks lubridate::quarter()
## x data.table::second() masks lubridate::second()
## x purrr::set_names() masks rlang::set_names(), magrittr::set_names()
## x lubridate::setdiff() masks base::setdiff()
## x purrr::splice() masks rlang::splice()
## x purrr::transpose() masks data.table::transpose()
## x lubridate::union() masks base::union()
## x data.table::wday() masks lubridate::wday()
## x data.table::week() masks lubridate::week()
## x data.table::yday() masks lubridate::yday()
## x data.table::year() masks lubridate::year()
# Read the raw Bitcoin OHLCV history; keep character columns as-is so the
# numeric coercion below is explicit and its NA warnings are visible.
stock <- read.csv("BitcoinStockMarketHistoricalData_2014_2021.csv",
                  stringsAsFactors = FALSE)
stock$Date <- as.Date(stock$Date, format = "%m/%d/%Y")
# Coerce all price/volume columns in one pass instead of six copy-pasted
# statements.  Non-numeric cells (e.g. "null" placeholders) become NA and
# raise a coercion warning, which is expected here.
num_cols <- c("Open", "High", "Low", "Close", "Adj.Close", "Volume")
stock[num_cols] <- lapply(stock[num_cols], as.numeric)
## Warning: NAs introduced by coercion
# Drop rows that failed coercion, then confirm nothing is left missing.
stock <- na.omit(stock)
sum(is.na(stock))
## [1] 0
head(stock, 6)
## Date Open High Low Close Adj.Close Volume
## 1 2014-09-17 465.864 468.174 452.422 457.334 457.334 21056800
## 2 2014-09-18 456.860 456.860 413.104 424.440 424.440 34483200
## 3 2014-09-19 424.103 427.835 384.532 394.796 394.796 37919700
## 4 2014-09-20 394.673 423.296 389.883 408.904 408.904 36863600
## 5 2014-09-21 408.085 412.426 393.181 398.821 398.821 26580100
## 6 2014-09-22 399.100 406.916 397.130 402.152 402.152 24127600
# Visualization ----
# Candlestick chart of OHLC prices with range-selector buttons, stacked over
# a matching volume bar chart that shares the x axis.
p1 <- stock %>%
  plot_ly(x = ~Date,
          type = "candlestick",
          open = ~Open,
          close = ~Close,
          high = ~High,
          low = ~Low,
          name = "price") %>%
  layout(
    xaxis = list(
      rangeselector = list(
        buttons = list(
          list(
            count = 1,
            label = "1 mo",
            # BUGFIX: this button was step = "week", so "1 mo" only
            # stepped back one week.
            step = "month",
            stepmode = "backward"),
          list(
            count = 3,
            label = "3 mo",
            step = "month",
            stepmode = "backward"),
          list(
            count = 6,
            label = "6 mo",
            step = "month",
            stepmode = "backward"),
          list(
            count = 1,
            label = "1 yr",
            step = "year",
            stepmode = "backward"),
          list(
            count = 3,
            label = "3 yr",
            step = "year",
            stepmode = "backward"),
          list(step = "all"))),
      rangeslider = list(visible = FALSE)),
    yaxis = list(title = "Price ($)",
                 showgrid = TRUE,
                 showticklabels = TRUE))
p2 <- stock %>%
  plot_ly(x = ~Date, y = ~Volume, type = 'bar', name = "Volume") %>%
  layout(yaxis = list(title = "Volume"))
# 70/30 vertical split: prices on top, volume below.
p <- subplot(p1, p2, heights = c(0.7, 0.3), nrows = 2,
             shareX = TRUE, titleY = TRUE) %>%
  layout(title = "Bitcoin")
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
p
# Initial Analysis ----
# Restrict to post-2017 prices and build a plain ts object of the adjusted
# close for the time-series diagnostics below.
bit_ts <- stock %>%
  filter(Date > as.Date('2017-01-01')) %>%
  arrange(Date) %>%
  select(Adj.Close) %>%
  as.matrix() %>%
  ts()
# Shared ggplot theme for every diagnostic plot in this analysis.
my_theme <- theme(panel.grid = element_line(color = '#e6e6e6'),
                  panel.background = element_rect(fill = 'white'),
                  plot.title = element_text(hjust = .5, size = 28, colour = '#ffa500'),
                  text = element_text(family = 'Georgia'),
                  axis.text = element_text(size = 10),
                  axis.title = element_text(size = 18, family = 'Georgia', face = 'bold'),
                  axis.line = element_line(colour = '#737373', size = 1),
                  strip.background = element_rect(colour = "black", fill = "white"),
                  strip.text = element_text(face = 'bold'))
# Lag plot coloured by time; breaks mark the (approximate) year boundaries.
# NOTE: do.lines = FALSE replaces the reassignable shorthand F.
gglagplot(bit_ts, do.lines = FALSE) + my_theme +
  scale_color_continuous(low = "#b37400", high = "#ffc04d", breaks = c(1, 366, 731, 1096, 1462), labels = c('2017', '2018', '2019', '2020', '2021')) +
  scale_y_continuous(breaks = c(0, 15000, 30000, 45000),
                     labels = c('$0', '$15,000', '$30,000', '$45,000')) +
  scale_x_continuous(breaks = c(30000, 60000),
                     labels = c('$30,000', '$60,000'))

# ACF/PACF of the raw series: slow ACF decay suggests non-stationarity.
ggAcf(bit_ts, lag.max = 200) + my_theme + labs(title = 'ACF', y = 'Correlation')

ggPacf(bit_ts, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Same diagnostics after first differencing.
ggAcf(diff(bit_ts), lag.max = 200) + my_theme + labs(title = 'ACF', y = 'Correlation')

ggPacf(diff(bit_ts), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# The First Difference ----
# Plot the first difference of the post-2017 adjusted close.  Computing the
# difference with lag() inside mutate() keeps the calculation in the
# pipeline instead of mixing piped data with external indexing
# (cut_bit_df[-1, ] + diff(cut_bit_df$Adj.Close)); the first row (no
# predecessor) is dropped either way.
cut_bit_df <- stock %>%
  filter(Date > as.Date('2017-01-01'))
ggplotly(cut_bit_df %>%
  mutate(Price = Adj.Close - lag(Adj.Close)) %>%
  filter(!is.na(Price)) %>%
  ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') + my_theme +
  labs(x = '', title = 'Bitcoin Differenced By One', y = 'Difference'))
# Model Fitting ----
# Variance-stabilise the post-2017 series with a Box-Cox transform, inspect
# the differenced diagnostics, then let auto.arima() choose an order.
bit_ts_tran <- BoxCox(bit_ts, lambda = BoxCox.lambda(bit_ts))
ggAcf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'ACF', y = 'Correlation')

ggPacf(diff(bit_ts_tran), lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Fit once and reuse the fitted object -- the original called auto.arima()
# twice (printing and again for checkresiduals), doubling the search cost.
fit_tran <- auto.arima(bit_ts_tran)
fit_tran
## Series: bit_ts_tran
## ARIMA(3,1,1) with drift
##
## Coefficients:
## ar1 ar2 ar3 ma1 drift
## 0.8974 0.0609 -0.0209 -0.9251 0.0012
## s.e. 0.1262 0.0346 0.0294 0.1235 0.0006
##
## sigma^2 estimated as 0.000445: log likelihood=3706.04
## AIC=-7400.08 AICc=-7400.02 BIC=-7368.13
checkresiduals(fit_tran)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(3,1,1) with drift
## Q* = 13.263, df = 5, p-value = 0.02103
##
## Model df: 5. Total lags used: 10
# Fit the Model Again (post-2019 window) ----
# Refit on the post-2019 window only.
cut2_bit_df <- stock %>%
  filter(Date >= ymd('2019-01-01'))
# BUGFIX: plot the Box-Cox-transformed column that mutate() creates; the
# original plotted raw Adj.Close while labelling the y axis "Transformed",
# leaving the mutate() as dead code.
ggplotly(cut2_bit_df %>%
  mutate(Price = BoxCox(Adj.Close, lambda = BoxCox.lambda(Adj.Close))) %>%
  ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') +
  labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)
# ts object of the 2019+ adjusted close, Box-Cox transformed.
bit_ts2 <- stock %>%
  filter(Date >= as.Date('2019-01-01')) %>%
  arrange(Date) %>%
  select(Adj.Close) %>%
  as.matrix() %>%
  ts()
bit_ts_tran2 <- BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))
ggAcf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'ACF', y = 'Correlation')

ggPacf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Fit once and reuse (original ran the auto.arima() search twice).
fit_2019 <- auto.arima(bit_ts_tran2)
fit_2019
## Series: bit_ts_tran2
## ARIMA(1,1,1) with drift
##
## Coefficients:
## ar1 ma1 drift
## -0.8011 0.7390 0.0012
## s.e. 0.1624 0.1827 0.0005
##
## sigma^2 estimated as 0.0002113: log likelihood=2220.29
## AIC=-4432.58 AICc=-4432.53 BIC=-4413.9
checkresiduals(fit_2019)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1) with drift
## Q* = 8.0078, df = 7, p-value = 0.3319
##
## Model df: 3. Total lags used: 10
autoplot(forecast(fit_2019))

# Refit on the calmer 2018 calendar-year window.
cut2_bit_df <- stock %>%
  filter(Date >= ymd('2018-01-01') & Date <= ymd('2019-01-01'))
# BUGFIX: plot the transformed Price column created by mutate(); the
# original plotted raw Adj.Close despite the "Transformed" axis label.
ggplotly(cut2_bit_df %>%
  mutate(Price = BoxCox(Adj.Close, lambda = BoxCox.lambda(Adj.Close))) %>%
  ggplot(aes(Date, Price)) + geom_line(col = '#ffa500') +
  labs(title = 'Bitcoin', x = '', y = 'Price (Transformed)') + my_theme)
# ts object for the 2018 window, Box-Cox transformed (overwrites the
# earlier bit_ts2/bit_ts_tran2, matching the original script's flow).
bit_ts2 <- stock %>%
  filter(Date >= as.Date('2018-01-01') & Date <= as.Date('2019-01-01')) %>%
  arrange(Date) %>%
  select(Adj.Close) %>%
  as.matrix() %>%
  ts()
bit_ts_tran2 <- BoxCox(bit_ts2, lambda = BoxCox.lambda(bit_ts2))
ggAcf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'ACF', y = 'Correlation')

ggPacf(bit_ts_tran2, lag.max = 200) + my_theme + labs(title = 'PACF', y = '')

# Fit once and reuse (original ran the auto.arima() search twice).
fit_2018 <- auto.arima(bit_ts_tran2)
fit_2018
## Series: bit_ts_tran2
## ARIMA(1,1,0) with drift
##
## Coefficients:
## ar1 drift
## -0.0414 -5e-04
## s.e. 0.0526 3e-04
##
## sigma^2 estimated as 3.554e-05: log likelihood=1353.01
## AIC=-2700.02 AICc=-2699.96 BIC=-2688.32
checkresiduals(fit_2018)

##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,0) with drift
## Q* = 11.636, df = 8, p-value = 0.1682
##
## Model df: 2. Total lags used: 10
# Fit ARIMA(1,1,1) with drift explicitly for comparison with the
# auto-selected ARIMA(1,1,0).  Fit ONCE and keep the object -- the original
# refit the identical model a second time just to extract residuals()
# (also: TRUE instead of the reassignable shorthand T).
fit_111 <- Arima(bit_ts_tran2, order = c(1, 1, 1), include.drift = TRUE)
summary(fit_111)
## Series: bit_ts_tran2
## ARIMA(1,1,1) with drift
##
## Coefficients:
## ar1 ma1 drift
## -0.3597 0.3093 -5e-04
## s.e. 0.4810 0.4852 3e-04
##
## sigma^2 estimated as 3.559e-05: log likelihood=1353.23
## AIC=-2698.46 AICc=-2698.35 BIC=-2682.86
##
## Training set error measures:
## ME RMSE MAE MPE MAPE
## Training set 1.242138e-05 0.005933249 0.004201422 0.0001799258 0.108142
## MASE ACF1
## Training set 1.007819 0.0153384
# In-sample residuals of the explicit ARIMA(1,1,1) fit.
err <- residuals(fit_111)
# Invert a Box-Cox transform.
# For lambda != 0 the forward transform is (x^lambda - 1) / lambda, so the
# inverse is (y * lambda + 1)^(1 / lambda).  For lambda == 0 the forward
# transform is log(x), so the inverse is exp(y) -- the original formula
# degenerated to 1^Inf in that case.
#
# ts_data: transformed value(s) (vectorized).
# lambda:  the Box-Cox lambda used in the forward transform.
# Returns the value(s) on the original scale.
invers_BoxCox <- function(ts_data, lambda) {
  if (lambda == 0) {
    exp(ts_data)
  } else {
    (ts_data * lambda + 1)^(1 / lambda)
  }
}
# Back-transform the residual standard deviation to the original price scale.
# NOTE(review): `err` comes from the 2018-window series (bit_ts_tran2), but
# the lambda passed here is estimated from the 2017+ series (bit_ts) --
# confirm which series' lambda the inverse transform is meant to use.
invers_BoxCox(sd(err), BoxCox.lambda(bit_ts))
## [1] 1.00596
# Text Data ----
# Load the Bitcoin news dataset and parse the publication date.
news <- read.csv("BitcoinNews_2021_1.csv", fileEncoding = "UTF-8",
                 stringsAsFactors = FALSE)
news$Date <- as.Date(news$Date, format = "%m/%d/%Y")
head(news, 6)
## Date Source
## 1 2021-02-07 bbc-news
## 2 2021-02-08 engadget
## 3 2021-02-08 techcrunch
## 4 2021-02-08 None
## 5 2021-02-08 techcrunch
## 6 2021-02-08 reuters
## Title
## 1 FCA warning over risky TikTok trading tips
## 2 Tesla buys in Bitcoin will soon accept it as payment
## 3 Tesla buys B in bitcoin may accept the cryptocurrency as payment in the future
## 4 Impeachment Trial Child Tax Credit Bitcoin Your Monday Evening Briefing
## 5 Daily Crunch DoorDash acquires Chowbotics
## 6 FOREX Dollar steadies after U S jobs related losses Reuters
## Headline
## 1 Some TikTok creators have been giving investment advice wake GameStop frenzy
## 2 Elon Musk cryptocurrency hype more than just idle talk CNBC reports that Tesla only bought billion worth Bitcoin help diversify maximize returns will start taking payments using digital asset sometime near futu
## 3 Today filing Tesla disclosed that acquired billion bitcoin popular cryptocurrency Moreover company noted that also accept bitcoin future form payment cars though allow that there
## 4 Here what need know
## 5 DoorDash acquires salad making robotics startup Twitter confirms subscription plans Tesla makes bitcoin This your Daily Crunch February story DoorDash acquires Chowbotics DoorDash acquired Area startup
## 6 Dollar index little changed after Friday payrolls fall Jobs data takes shine dollar rebound Ethereum gains futures debut Bitcoin hits record high after Tesla purchase Graphic World rates https tmsnrt RBWI Adds details Bitcoin
# All text analysis below works on the headline text only.
news_text <- news$Headline
# cleaning
# Clean the headline text with tm: lowercase, strip numbers, punctuation
# and extra whitespace, drop stopwords, then stem.
library(tm)
## Loading required package: NLP
## (NLP::annotate masks ggplot2::annotate)
news_source <- Corpus(VectorSource(as.vector(news_text)))
# content_transformer() keeps the corpus wrapper intact; a bare tolower
# makes tm warn "transformation drops documents".
news_corpus <- tm_map(news_source, content_transformer(tolower))
news_corpus <- tm_map(news_corpus, removeNumbers)     # remove numbers
news_corpus <- tm_map(news_corpus, removePunctuation) # remove punctuation
news_corpus <- tm_map(news_corpus, stripWhitespace)   # collapse extra spaces
# BUGFIX: remove stopwords BEFORE stemming.  stopwords("english") lists
# unstemmed forms (e.g. "this"), so the original order (stem first) left
# stemmed stopword fragments such as "thi" in the corpus.
news_corpus <- tm_map(news_corpus, removeWords, stopwords("english"))
news_corpus <- tm_map(news_corpus, stemDocument, language = "english") # stem word endings
# News Sentiment Analysis ----
# Tokenise each headline into one word per row, keeping the article
# metadata; drop stop words (plus the scraping artifact "chars") and any
# token that is not purely lowercase letters/apostrophes.
news_words <- news %>%
  select(Date, Source, Title, Headline) %>%
  unnest_tokens(word, Headline) %>%
  filter(!word %in% c(stop_words$word, "chars"),
         str_detect(word, "^[a-z']+$"))
# Duplicate the date under a lowercase name for later convenience.
news_words$date <- news_words$Date
# Overall word frequencies, most common first.
words_only <- news_words %>%
  count(word, sort = TRUE)
# Fixed seed so the word-cloud layout is reproducible.
set.seed(1)
wordcloud(words = words_only$word, freq = words_only$n, scale = c(5, .5),
          max.words = 50, colors = brewer.pal(8, "Dark2"))

# Word Cloud ----
# Document-term matrix over the cleaned corpus, materialised as a plain
# matrix so row/column sums work directly.
news_dtm <- as.matrix(DocumentTermMatrix(news_corpus))
# The ten most frequent (stemmed) terms across all headlines.
WordFreq <- colSums(news_dtm)
ord <- order(WordFreq)
WordFreq[tail(ord, 10)]
## cnbc elon musk high record billion invest
## 21 21 25 25 29 37 38
## cryptocurr tesla bitcoin
## 45 76 189
# Token count per document (headline).
Row_Sum_Per_doc <- rowSums(news_dtm)
print(Row_Sum_Per_doc)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 8 29 21 2 24 32 23 21 16 13 19 19 22 23 22 24 23 23 25 23
## 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
## 26 11 11 10 22 12 14 14 14 12 22 22 16 12 26 26 26 25 7 5
## 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 19 10 12 26 16 24 24 22 9 7 26 26 24 24 9 24 22 20 24 13
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80
## 22 11 11 6 18 10 23 14 14 14 15 15 18 23 23 4 24 22 24 22
## 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## 25 24 25 26 23 27 12 12 25 8 5 21 21 12 19 18 14 24 14 27
## 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120
## 12 16 21 24 23 26 8 18 26 13 24 24 9 23 27 12 21 24 8 22
## 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140
## 24 21 10 19 23 6 6 10 21 5 15 15 26 26 8 27 21 26 26 12
## 141 142 143 144 145 146
## 5 12 8 15 10 5
# Word cloud of term frequencies in the Bitcoin news headlines.
# (The original comment referred to "Hamilton files" -- a copy-paste
# leftover from another analysis.)
library(wordcloud)
v <- sort(colSums(news_dtm), decreasing = TRUE) # term frequencies, descending
d <- data.frame(word = names(v), freq = v)      # as a two-column data frame
write.csv(d, file = "words.csv")                # persist for later reuse
head(d, 10)
## word freq
## bitcoin bitcoin 189
## tesla tesla 76
## cryptocurr cryptocurr 45
## invest invest 38
## billion billion 37
## record record 29
## musk musk 25
## high high 25
## cnbc cnbc 21
## elon elon 21
# Fixed seed so the cloud layout is reproducible across knits.
set.seed(1234)
wordcloud(words = d$word, freq = d$freq, min.freq = 5,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))

# News Sentiment over Time ----
# Score each headline with the AFINN lexicon and plot sentiment over time.
library(textdata)
afinn <- get_sentiments("afinn")
# inner_join(by = "word") replaces left_join() + filter(!is.na(value)):
# same rows kept, no implicit-key "Joining, by" message.  .groups = "drop"
# returns an ungrouped result (the original left it grouped by Title).
sentiment_summary <- news_words %>%
  inner_join(afinn, by = "word") %>%
  group_by(Title, Date) %>%
  summarise(score = mean(value), .groups = "drop") %>%
  mutate(sentiment = ifelse(score > 0, "positive", "negative"))
# Interactive table of per-headline scores.
datatable(sentiment_summary)
# Daily sentiment as bars coloured by sign.
ggplot(sentiment_summary, aes(Date, score)) +
  geom_bar(stat = "identity", aes(fill = sentiment)) +
  ggtitle("Bitcoin: News Sentiment Over Time")